# rebranding_scrape.R (Frederick Solt)
# Gets and formats data on parties' vote shares and re-naming from parties-and-elections.eu 
# dependency: widetabfix.pl (which corrects issues with wide-format tables on the site)
# 1.0: 6 June 2013
# 1.1: 22 Nov 2013 (fix country names, sort)
# 1.2: Mar 2014 (drop non-party entries)

library(XML)
library(stringr)
library(doBy)
library(ggplot2)
library(scales)

# Load the site's country index page and build the list of archive URLs to scrape.
url1 <- ("http://www.parties-and-elections.eu/countries.html")
pe1 <- readLines(url1)

# Keep every line that links to a local .html page and reduce it to the page
# stem (lower-case letters plus an optional single digit, e.g. "austria").
lines <- grep("href=\\\".*\\.html", pe1, value=TRUE)
countries <- gsub(pattern=".*href=\\\"([a-z]*[0-9]?)\\.html.*", 
                  replacement="\\1", x=lines)

# Page stems that are skipped entirely (site navigation pages, sub-national
# pages, and a handful of countries; the reason for each exclusion is not
# recorded in this script).
exclude <- c("index", "calendar", "countries", "references", "links", "content",
             "austria3", "brussels", "dg", "flanders", "wallonia", "ncyprus", "faroes",
             "greenland", "eu", "aland", "germany3", "italy3", "portugal3", "vojvodina",
             "spain3", "crimea", "gibraltar", "nireland", "scotland", "wales",
             "belarus", "russia", "ukraine", "turkey", "kosovo")
countries <- countries[!countries %in% exclude] 

# <stem>.html is each country's main page (built here but not read below).
links <- gsub(pattern="(.*)", 
              replacement="http://www.parties-and-elections.eu/\\1.html", x=countries)

# <stem>2.html is each country's election archive, which is what gets scraped.
links2 <- gsub(pattern="(.*)", 
               replacement="http://www.parties-and-elections.eu/\\12.html", x=countries)

# These countries have their archive split over two pages, <stem>2a.html and <stem>2b.html.
twopart <- c("greece", "belgium", "italy", "denmark") # I confirmed there is only one problem over the page divide, fixed below
for (i in seq_along(twopart)) {
    links2 <- gsub(pattern=paste(twopart[i],"2.html", sep=""), replacement=paste(twopart[i],"2a.html", sep=""), x=links2)
}
links3 <- gsub(pattern="(.*)", 
               replacement="http://www.parties-and-elections.eu/\\12b.html", x=twopart)

# Append the second ("2b") pages so that countries[i] and links2[i] stay aligned;
# the two-part countries therefore appear twice in both vectors.
countries <- c(countries, twopart)
links2 <- c(links2, links3)

# Scrape every archive page in links2 and stack the results in change.data.
# For each page the loop
#   (1) reads the election years and the vote-share table,
#   (2) reshapes the vote shares to long format (one row per party-election),
#   (3) parses the "Abbreviations" note for current and former party acronyms
#       and the years attached to them, and flags those party-years with change = 1.
# Result columns: party, year, votes, change, country.
for (i in seq_along(countries)) {
    cat("Processing", countries[i], "archives \n")
    archive <- readLines(links2[i])
    # Pages on which ">Seats<" occurs on more than one line use the wide table layout
    wide <- (length(grep(">Seats<", archive)) >1)
    # Election years are read from the header cells with background colour #E7E9F1
    years <- grep(pattern="<td.*bgcolor=\"#E7E9F1\".*>[0-9]{4}", x=archive, value=TRUE)
    years <- as.numeric(gsub(pattern=".*([0-9]{4}).*", replacement="\\1", x=years))
    years[which(duplicated(years))] <- years[which(duplicated(years))]+.1 # distinguish second election in a given year (rebranding assumed to occur in first election of year)
    tabs <- readHTMLTable(links2[i])
    tab <- tabs[[length(tabs)-1]] # the results table is the second-to-last table on the page
    
    if (!wide) {
        # Narrow layout: keep the rows whose second column is "%"; columns 3+ hold the shares
        votes <- tab[which(tab$V2=="%"),]
        # Decimal commas -> decimal points, then convert to numeric (non-numeric cells become NA)
        votes <-cbind(votes$V1, as.data.frame(apply(apply(votes[c(3:length(votes))], 2, gsub, pattern=",", replacement="."), 2, as.numeric)))
    } else {
        # Wide layout: if any body cell is NA, round-trip the table through widetabfix.pl
        if (mean(is.na(tab[3:nrow(tab),2:ncol(tab)]))!=0) {
            write.table(tab, file="tab.csv", sep=",", row.names=FALSE, col.names=FALSE)
            # NOTE(review): the path is hard-coded to the author's machine; tab.csv is written to and
            # tab1.csv read from the R working directory, so this presumably only works when the
            # working directory is that same folder -- confirm before running elsewhere.
            system("cd ~/'Documents/Projects/Party Rebranding/Rebranding1/Paper'; perl widetabfix.pl tab.csv")
            tab <- read.table("tab1.csv", sep=",")
        }
        # Rows 3+ are parties; the columns whose second-row header is "%" hold the shares
        party <- as.character(tab[3:nrow(tab),1])
        votes <- cbind(party, tab[3:nrow(tab),which(tab[2,]=="%")])
        votes <- cbind(votes$party, as.data.frame(apply(apply(votes[c(2:length(votes))], 2, gsub, pattern=",", replacement="."), 2, as.numeric)))
    }
    
    votes[is.na(votes)] <- 0 # missing/non-numeric shares are treated as zero votes
    names(votes) <- c("party", paste("vote", years, sep="."))
    votes$party <- as.character(votes$party)
    votes <- votes[!duplicated(votes$party),] # catches repeated-party error in Estonia table (and maybe others)
    # Wide -> long: one row per party and election year
    votes.l <- reshape(votes, varying = 2:length(votes),
                       v.names= "votes", idvar="party", timevar = "year", 
                       times = years, direction = "long")
    # Join party labels that were broken over two lines in the HTML
    votes.l$party <- gsub(pattern="(.*)\\r\\n\\W*(.*)", replacement="\\1 \\2", x=votes.l$party)
    
    # The notes are the last cell of the first column of the first table on the page;
    # keep only the text between "Abbreviations:" and "Source"
    notes <- as.character(tabs[[1]][length(tabs[[1]][,1]),1])
    notes <- gsub(pattern=".*Abbreviations:\\W(.*)Source.*", replacement="\\1", x=notes)
    notes <- gsub(pattern="\\r\\n", replacement="", x=notes)
    # Country-specific clean-ups of parentheses that would otherwise confuse the split on ")" below
    if(countries[i]=="iceland") notes <- gsub(pattern="BF \\(([12])\\)", replacement="BF\\1", x=notes)
    if(countries[i]=="denmark") notes <- gsub(pattern="\\((since [0-9]{4})\\)", replacement="\\1", x=notes)
    if(countries[i]=="italy" && (links2[i] %in% links3)) notes <- gsub(pattern="^(.*)\\*.*", replacement="\\1", x=notes)
    notes0 <- notes
    notes <- unlist(strsplit(notes, "\\)"))
    # The split changes the notes only if they contain a ")" (i.e. a parenthesised name history).
    # identical() is used because notes may now have several elements, and a condition of
    # length > 1 inside if() is an error in current versions of R.
    if (!identical(notes, notes0)) { # as long as there is at least one rebranded party . . .
        notes <- gsub(pattern="^; ", replacement="", x=notes)
        
        changed.parties <- gsub(pattern="^(.*):[^;]* \\(.*", replacement="\\1", x=notes) # First, get the current acronym
        changed.parties <- gsub(pattern=".*;\\W*(.*)", replacement="\\1", x=changed.parties)
        changed.parties <- head(changed.parties, -1) # because of leftover tail of string after splitting
        changed.parties <- gsub(pattern=".*;\\W*(.*)", replacement="\\1", x=changed.parties)
        changed.parties <- gsub(pattern="^\\W*(.*)", replacement="\\1", x=changed.parties)
        
        c.p <- gsub(pattern="^(.*):[^;]* (\\(.*)", replacement="\\1 \\2", x=notes)  # then get all old acronyms
        c.p <- head(c.p, -1) # because of leftover tail of string after splitting
        c.p <- gsub(pattern=".*\\((.*)", replacement="\\1, ", x=c.p) # drop anything preceding an open paren (e.g., parties w/o name changes)
        c.p <- gsub(pattern="[^,]*,\\W+([^,;]*)[,;][^,]*", replacement=" \\1,", x=c.p) # retain only acronyms
        c.p <- gsub(pattern=".*:[^,]*,([^,;]*)[,;][^,]*", replacement=" \\1,", x=c.p) # retain only acronyms, second pass
        c.p <- gsub(pattern=" (.*),", replacement="(\\1)", x=c.p) # drop trailing comma and surround with parens
        if(countries[i]=="germany") c.p <- gsub(pattern=" Party for Unity,", replacement="", x=c.p) # kludge for Germany
        if(countries[i]=="norway") c.p <- gsub(pattern="\\( FMS\\)", replacement="(FMS, RV)", x=c.p) # kludge for Norway
        if(countries[i]=="sweden") c.p <- gsub(pattern="\\(KDS, KDS\\)", replacement="(KDS)", x=c.p) # kludge for Sweden
        c.p <- gsub("((\\b\\w+), \\2,)", "\\2,", x=c.p) # to delete repeated acronyms when name changes but acronym is retained
        changed.parties <- paste(changed.parties, c.p, sep=" ") # put new and old acronyms together
        
        changed.parties <- gsub(pattern="(^(.*\\b) \\(.*)((?<!\\-)\\b\\2\\b(?!(\\-|\\+)))(.*)", replacement="\\1\\5", x=changed.parties, perl=TRUE) # this bit gets rid of an old acronym if current party name has same acronym
        # Tidy the punctuation left behind by the removals above
        changed.parties <- gsub(pattern="\\(, ", replacement="(", x=changed.parties)
        changed.parties <- gsub(pattern=" \\($", replacement="", x=changed.parties)
        changed.parties <- gsub(pattern=", ,", replacement=",", x=changed.parties)
        changed.parties <- gsub(pattern=", \\)", replacement=")", x=changed.parties)
        changed.parties <- gsub(pattern=",\\)", replacement=")", x=changed.parties)
        changed.parties <- gsub(pattern="\\(\\)", replacement=")", x=changed.parties)
        changed.parties <- gsub(pattern="\\)\\W*", replacement=")", x=changed.parties)
        changed.parties <- gsub(pattern=" \\)$", replacement="", x=changed.parties)
        
        # Hand corrections for individual labels the parsing above gets wrong
        if(countries[i]=="unitedkingdom") changed.parties[which(changed.parties=="GP (EP)")] <- "GP" # catches bug in Britain table
        if(countries[i]=="norway") changed.parties[which(changed.parties=="SV (SF)")] <- "SV (SF, SV)" # catches bug in Norway table
        if(countries[i]=="sweden") changed.parties[which(changed.parties=="V (SKP, VPK)")] <- "V (SKP, VKP)" # catches bug in Sweden table
        if(countries[i]=="moldova") changed.parties[which(changed.parties=="PPCD (FPM, FPCD)")] <- "PPCD (FPCD, FPM)" # catches bug in Moldova table
        if(countries[i]=="moldova") changed.parties[which(changed.parties=="PFD (CI + Christian-Democratic Party of Moldova)")] <- "PFD (CI-PDCM)" # kludge for Moldova 
        if(countries[i]=="hungary") changed.parties[which(changed.parties=="FIDESZ (FIDESZ)")] <- "FIDESZ" # kludge for Hungary 
        if(countries[i]=="latvia") changed.parties[which(changed.parties=="LSDSP (DT, LSDA, A)")] <- "LSDSP (DT, LSDA)" # kludge for Latvia 
        if(countries[i]=="poland") changed.parties[which(changed.parties=="ChD-SP (SP, ChD)")] <- "ChD-SP (ChD)" # kludge for Poland 
        if(countries[i]=="romania") changed.parties[which(changed.parties=="CDR 2000:(CDR 2000)")] <- "CDR" # kludge for Romania 
        if(countries[i]=="slovakia") changed.parties[which(changed.parties=="LS-HZDS (HZDS, HZDS-RSS)")] <- "LS-HZDS (HZDS-RSS, HZDS)" # bug fix for Slovakia 
        if(countries[i]=="italy") changed.parties[which(changed.parties=="1968: Unified Socialist Party, PSU (PSIUP, PSU)")] <- "PSI (PSIUP, PSU)" # kludge for Italy 
        if(countries[i]=="spain") changed.parties[which(changed.parties=="CiU (PDPC + Centre Union)")] <- "CiU (PDPC, UDC)" # kludge for Spain 
        if(countries[i]=="switzerland") changed.parties[which(changed.parties=="PS (AP)")] <- "FPS (AP)" # kludge for Switzerland 
        if(countries[i]=="estonia") changed.parties[which(changed.parties=="EPPL (EPL)")] <- "EPPL" # kludge for Estonia 
        if(countries[i]=="bulgaria") changed.parties[which(changed.parties=="BZNS-NP (BRSDP-O)")] <- "BZNS-NP (BZNS-NP/BRSDP-O)" # bug fix for Bulgaria 
        
        # Years (or year ranges, "YYYY-YYYY") mentioned in each party's note
        change.years <- str_extract_all(notes, "([0-9]{4}|[0-9]{4}\\-[0-9]{4})")
        change.years <- head(change.years, -1) # because of leftover tail of string after splitting
        # last.year: the final year token of each note (the end year if that token is a range)
        last.year <- lapply(change.years, function(x) gsub(pattern=".*\\-([0-9]{4})$","\\1", x)[length(x)])
        # Reduce ranges to their start year
        change.years <- lapply(change.years, function(x) gsub(pattern="\\-[0-9]{4}$","", x))
        
        # last.change: the election year that follows last.year in this page's list of elections
        years2 <- unique(years)
        last.change <- lapply(last.year, function(x) years2[which(years2==x)+1])
        
        change.years <- lapply(change.years, function(x) paste(x, collapse=","))
        change.years <- lapply(change.years, function(x) gsub(pattern="^[0-9]{4},?","", x)) # first year of party isn't a *re-*branding
        change.years <- paste(change.years, last.change, sep=",")
        change.years <- lapply(change.years, function(x) gsub(pattern="^,","", x)) 
        change.years <- strsplit(as.character(change.years), ",")
        
        # One row per (party, change year), all flagged change = 1
        changes <- cbind(party=rep(changed.parties, sapply(change.years, length)), year=unlist(change.years), change=1)
        
        # Full outer join with the vote shares; party-years without a flagged change get 0
        c.data <- merge(as.data.frame(votes.l), as.data.frame(changes), by=c("party", "year"), all=TRUE)
        c.data$change <- as.numeric(c.data$change)
        c.data$change[is.na(c.data$change)] <- 0
        c.data <- c.data[which(c.data$party!="Turnout"),]
        # Country name = page stem with its first letter capitalised
        c.data$country <- gsub(pattern="\\b([a-z])", replacement="\\U\\1", x=as.character(countries[i]), perl=TRUE)
    } else { # if no rebranded parties . . .
        c.data <- votes.l
        c.data$change <- 0
        c.data <- c.data[which(c.data$party!="Turnout"),]
        c.data$country <- gsub(pattern="\\b([a-z])", replacement="\\U\\1", x=as.character(countries[i]), perl=TRUE)
        
    }
    # Stack this page's rows under those of the pages already processed
    if (i==1) change.data <- c.data else change.data <- rbind(change.data, c.data)
}
# Keep an untouched copy of the scraped data
c.d <- change.data


####
# Manual clean-up of the scraped data. The statements below are order-dependent
# hand corrections; each one targets specific labels or years in the scraped tables.

# Start from the untouched copy of the scrape (presumably so this section can be
# re-run without repeating the scrape).
change.data <- c.d
# Restore the spacing/capitalisation of multi-word country names
change.data$country[change.data$country=="Czechrepublic"] <- "Czech Republic"
change.data$country[change.data$country=="Sanmarino"] <- "San Marino"
change.data$country[change.data$country=="Unitedkingdom"] <- "United Kingdom"

# Drop rows without a vote share, then sort by country, party and year
change.data <- change.data[which(!is.na(change.data$votes)),] 
change.data <- change.data[with(change.data, order(country, party, year)), ]

# Replace upper-case umlauts in party labels with plain letters
change.data$party <- gsub(pattern="Ö", replacement="O", x=change.data$party)
change.data$party <- gsub(pattern="Ü", replacement="U", x=change.data$party)
# Put the two PRLW/MR labels under one combined label
change.data$party <- gsub(pattern="PRLW \\(PRLW-PL\\)", replacement="MR (PRLW-PL, PRL, PRL-FDF)", x=change.data$party)
change.data$party <- gsub(pattern="MR \\(PRL, PRL-FDF\\)", replacement="MR (PRLW-PL, PRL, PRL-FDF)", x=change.data$party)
# Same relabelling of "BF (1)"/"BF (2)" that the scraping loop applies to Iceland's notes
change.data$party[change.data$party=="BF (1)"] <- "BF1"
change.data$party[change.data$party=="BF (2)"] <- "BF2"
# Flag name changes by hand for these two party-years
change.data$change[change.data$party=="MR (PRLW-PL, PRL, PRL-FDF)" & change.data$year=="1981"] <- 1
change.data$change[change.data$party=="AOV (AOV-55+)" & change.data$year=="1998"] <- 1
# Denmark and Italy have their archives split over two pages, so a party's label can
# differ between the pages; unify the labels and set the change flags by hand.
change.data$party[change.data$party=="KRF" & change.data$country=="Denmark"] <- "KD (KRF)" #fixes twopage issue
change.data$change[change.data$party=="KD (KRF)" & change.data$country=="Denmark" & change.data$year==1990] <- 1
change.data$party[change.data$party=="PR (LP)" & change.data$country=="Italy"] <- "RAD (LP, LPS, LB)" #fixes twopage issue
change.data$change[change.data$party=="RAD (LP, LPS, LB)" & change.data$country=="Italy" & change.data$year==1992] <- 1
change.data$party[change.data$party=="VERDI" & change.data$country=="Italy"] <- "VERDI (LV)" #fixes twopage issue
change.data$party[change.data$party=="PCI (FDP, PDS)" & change.data$country=="Italy"] <- "DS (FDP, PCI, PDS)" #fixes twopage issue
change.data$party[change.data$party=="DS (PDS)" & change.data$country=="Italy"] <- "DS (FDP, PCI, PDS)" #fixes twopage issue
change.data$year[change.data$year=="2007" & change.data$country=="Portugal"] <- "2009" #fixes typo in Parties and Elections data

# Remove table rows that are not parties (independents, "others", etc.)
not.parties <- c("Ind.", "Independents", "Others", "Others/Ind.", "Åland", "Independents/Aosta Valley*")
change.data <- change.data[!change.data$party %in% not.parties,]
# Store year as a number from here on (a second election in a year carries a .1 suffix)
change.data$year <- as.numeric(change.data$year)

# Split each party label ("NEW (OLD1, OLD2)") into its individual acronyms and
# append them as columns name1, name2, ...; labels with fewer acronyms than the
# longest one are padded with NA.
acronyms <- strsplit(change.data[,1], " \\(|, |)")
widest <- max(vapply(acronyms, length, integer(1)))
padded <- lapply(acronyms, function(a) {
    length(a) <- widest # extending a vector pads it with NA
    a
})
acronym.cols <- data.frame(do.call(rbind, padded)) # one row per label, columns X1..Xn
change.data <- cbind(change.data, acronym.cols)
names(change.data) <- gsub(pattern="X", replacement="name", x=names(change.data))

#Drop parties when they have not yet received any votes (not formed yet)
# total_votes is the running sum of a party's votes in chronological order, so it
# stays at 0 until the party's first election with a non-zero share. ave() returns
# the sums in the original row positions, and the helper column is removed by name
# rather than by position, so the number of acronym columns does not matter.
change.data <- change.data[with(change.data, order(country, party, year)),]
change.data$total_votes <- with(change.data, ave(votes, interaction(country, party, drop=TRUE), FUN=cumsum))
change.data <- change.data[change.data$total_votes>0, names(change.data)!="total_votes"]

#Drop parties when they never receive votes again (disbanded)
# Same idea with the years reversed: the running sum stays at 0 for every election
# after the party's last non-zero share.
change.data <- change.data[with(change.data, order(country, party, -year)),]
change.data$total_votes_rev <- with(change.data, ave(votes, interaction(country, party, drop=TRUE), FUN=cumsum))
change.data <- change.data[change.data$total_votes_rev>0, names(change.data)!="total_votes_rev"]
change.data <- change.data[with(change.data, order(country, party, year)),]

# Save: clear every object except the cleaned data (change.data) and the raw
# scrape (c.d), then write the cleaned data to disk.
keep <- c("change.data", "c.d")
rm(list=setdiff(ls(), keep))
save(change.data, file="change_data.RData")

